library(dplyr)
library(data.table)
library(stringr)

#' Collapse a three-part branching question into a single 1-7 score
#'
#' The three response vectors are expected to be parallel (one element per
#' respondent). Codes are assigned in the order below, so when more than one
#' rule matches the same element the later rule wins:
#'   first  == "Very close"      -> 1
#'   first  == "Somewhat close"  -> 2
#'   second == pro_tech_text     -> 3
#'   second == in_between_text   -> 4
#'   second == anti_tech_text    -> 5
#'   third  == "Somewhat close"  -> 6
#'   third  == "Very close"      -> 7
#'
#' @param first Character vector of responses to the first closeness item.
#' @param second Character vector of responses to the forced-choice item.
#' @param third Character vector of responses to the second closeness item.
#' @param pro_tech_text Exact text in `second` that is coded 3.
#' @param anti_tech_text Exact text in `second` that is coded 5.
#' @param in_between_text Exact text in `second` that is coded 4.
#' @return Numeric vector the same length as `first`; elements matching none
#'   of the rules are `NA`.
recode_to_7 <- function(first, second, third, pro_tech_text, anti_tech_text, in_between_text){
  # Size the result from the input itself. The previous `length(df)` looked up
  # a `df` outside this function (normally stats::df, a function of length 1),
  # so the result length depended on whatever `df` was visible to the caller.
  out <- rep(NA_real_, length(first))

  out[first == 'Very close'] <- 1
  out[first == 'Somewhat close'] <- 2

  out[second == pro_tech_text] <- 3
  out[second == in_between_text] <- 4
  out[second == anti_tech_text] <- 5

  out[third == "Somewhat close"] <- 6
  out[third == "Very close"] <- 7

  out
}


#' Import and clean the wave-1 video survey export
#'
#' Reads the raw CSV with every column as character, applies the screening
#' filters in sequence (consent, the 'PG Tips'/'Delicious' check, the Q4.1
#' check, non-empty treatment), recodes outcomes and covariates to numeric
#' scales, and returns one row per unique `PID`.
#'
#' @param path Path to the raw CSV export. Defaults to the original
#'   hard-coded location.
#' @param survey_year Year subtracted from `Q5.2` to compute `age`. Defaults
#'   to the original hard-coded 2021.
#' @return A data.frame with the recoded outcomes, attention-check flags
#'   (`pass_att1`..`pass_att3` and their sum `att`), covariates, `PID`,
#'   `treatment`, and `pc`: the first principal component of the outcome
#'   items, scaled to unit standard deviation. `pc` is `NA` for respondents
#'   with any missing outcome item.
import_video_wave_1 <- function(path = 'Data/video-wave-1.csv', survey_year = 2021){
  df <- fread(path, colClasses = 'character')

  # ---- Screening -----------------------------------------------------------
  # Filters are applied one after another, so each mask is computed on the
  # rows that survived the previous step.
  opt_in <- df$Q1.2 == 'I agree to participate'
  df <- df[opt_in,]

  # A respondent passes when both items of either the Q3 pair or the Q2 pair
  # match (presumably the video and audio checks -- confirm against the
  # questionnaire).
  pass_av <- (df$Q3.4 == 'PG Tips' & df$Q3.5 == 'Delicious')|(df$Q2.4 == 'PG Tips' & df$Q2.5 == 'Delicious')
  df <- df[pass_av,]

  pass_easy <- df$Q4.1 == 15
  df <- df[pass_easy,]
  df <- df[df$treatment != '',]

  # ---- Attention checks ----------------------------------------------------
  # Each grid check requires the two expected answers AND exactly two
  # non-empty cells across the matching columns.
  # NOTE(review): the grepl() patterns in this function are unanchored regexes
  # ('.' matches any character), so e.g. 'Q17.1' would also match a column
  # named 'Q17.10' -- confirm against the export's column names.
  pass_att1 <- (df$Q5.4_1 == 'Every day' & df$Q5.4_5=='Never') & 
    rowSums(df[,grepl('Q5.4', colnames(df)), with=FALSE] != '') ==2
  pass_att2 <- (df$Q5.7_5 == 'Classified' & df$Q5.7_12 =='None of the above') & 
    rowSums(df[,grepl('Q5.7', colnames(df)), with=FALSE] != '') ==2
  # NOTE(review): 'agreenor' has no space; left untouched because it may be
  # the literal text in the export -- confirm before "fixing".
  pass_att3 <- df$grid_screen_3 == 'Neither agreenor disagree'

  # ---- Outcomes ------------------------------------------------------------
  # Count of non-empty Q13.9 cells: first five columns vs. columns six to nine.
  break_up <- df[,grepl( 'Q13.9', colnames(df)), with=FALSE]
  break_tech <- rowSums(break_up[,1:5] != '')
  break_placebo <- rowSums(break_up[,6:9] != '')

  # Favorability items on a 1-7 scale; apply() returns a numeric matrix with
  # one column per Q17.1 item.
  fav <- df[,grepl( 'Q17.1', colnames(df)), with=FALSE]
  fav <- apply(fav, 2, function(x){ recode(x, 'Very favorable'=7, 
                                           'Favorable'=6, 
                                           'Somewhat favorable'=5,
                                           'Neither favorable nor unfavorable'=4,
                                           'Somewhat unfavorable'=3,
                                           'Unfavorable'=2,
                                           'Very unfavorable'=1,
                                           .default = NA_real_)})
  fav_tech <- rowMeans(fav[,1:5])
  fav_plac <- rowMeans(fav[,6:ncol(fav)])

  # Branching attitude items collapsed to 1-7 via recode_to_7().
  scale <- recode_to_7(first=df$Q13.3, second=df$Q13.7, third=df$Q13.5, 
                       pro_tech_text='The scale and efficiency of large tech companies like Apple, Amazon, Facebook, and Google benefits consumers more than it hurts them',
                       anti_tech_text='Large tech companies like Apple, Amazon, Facebook, and Google use their size to gain an unfair advantage over competitors and disadvantage consumers',
                       in_between_text='Both come equally close to my view')

  censorship <- recode_to_7(first=df$Q14.5, second=df$Q14.7, third=df$Q14.3, 
                            pro_tech_text='Social networks like Facebook and Twitter should allow their users to freely express their views, even if it means allowing false, offensive, or harmful content to circulate',
                            anti_tech_text='Social networks like Facebook and Twitter should do more to remove false, offensive, or harmful content from their platforms',
                            in_between_text='Truly unsure')

  privacy <- recode_to_7(first=df$Q15.3, second=df$Q15.7, third=df$Q15.5, 
                         pro_tech_text='Big tech companies do a good job of keeping their users’ information secure',
                         anti_tech_text="Big tech companies do not do a good job of keeping their users’ information secure",
                         in_between_text='Truly unsure')

  congress <- recode_to_7(first=df$Q15.13, second=df$Q15.15, third=df$Q15.11, 
                          pro_tech_text="Congress should allow large tech companies to store and use data on their users as they see fit",
                          anti_tech_text="Congress should more actively regulate how large tech companies gather and store data on their users",
                          in_between_text='Truly unsure')

  influence <- recode_to_7(first=df$Q16.4, second=df$Q16.5, third=df$Q16.3, 
                           pro_tech_text="The influence of big tech companies on political life in America is often exaggerated",
                           anti_tech_text="Big tech companies exert too much influence over the political life in America" ,
                           in_between_text='Truly unsure')

  # ---- Treatment perception items -----------------------------------------
  treat_supports <- recode(df$Q11.2,
                     'Definitely supports'=5,
                     'Somewhat supports'=4,
                     'Neither opposes nor supports'=3,
                     'Somewhat opposes'=2,
                     'Definitely opposes'=1,
                     .default = NA_real_)

  treat_effective <- recode(df$Q11.4,
                     'Definitely effective'=5,
                     'Effective'=4,
                     'Not sure'=3,
                     'Not effective'=2,
                     'Definitely not effective'=1,
                     .default = NA_real_)

  understand_tech <- recode(df$Q11.6,
                            'Very well'=4,
                            'Fairly well'=3,
                            'Not very well'=2,
                            'Not at all'=1,
                            .default = NA_real_)

  treat_liberal <- recode(df$Q11.8,
                          'Very liberal'=7,
                          'Somewhat liberal'=6,
                          'Slightly liberal'=5,
                          'Neither liberal nor conservative'=4,
                          'Slightly conservative'=3,
                          'Somewhat conservative'=2,
                          'Very conservative'=1,
                          .default = NA_real_)

  # ---- Party identification -----------------------------------------------
  # 7-point scale assembled from three follow-up items:
  # 1 = Strong Republican ... 4 = Neither ... 7 = Strong Democrat.
  pid7 <- rep(NA, nrow(df))
  pid7[df$Q18.4 == 'Strong Democrat'] <- 7
  pid7[df$Q18.4 == 'Not very strong Democrat'] <- 6

  pid7[df$Q18.6 == 'Strong Republican'] <- 1
  pid7[df$Q18.6 == 'Not very strong Republican'] <- 2

  pid7[df$Q18.8 == 'Closer to the Republican Party'] <- 3
  pid7[df$Q18.8 == 'Closer to the Democratic Party'] <- 5
  pid7[df$Q18.8 == 'Neither'] <- 4

  # Three-way party variable with leaners folded into their party.
  pid3_leaner <- rep(NA, nrow(df))
  pid3_leaner[pid7 %in% c(7, 6, 5)] <- "Democrat"
  pid3_leaner[pid7 == 4] <- "Independent"
  pid3_leaner[pid7 %in% c(1, 2, 3)] <- "Republican"

  # ---- Demographics and covariates ----------------------------------------
  ideo <- recode(df$Q18.10, 
                 'Very liberal'=1,
                 'Liberal'=2,
                 'Moderate'=3,
                 'Conservative'=4,
                 'Very conservative'=5,
                 .default = NA_real_)

  income <- recode(df$Q18.12, 
                   "Less than $10,000"=1,
                   "$10,000 - $19,999"=2,
                   "$20,000 - $29,999" =3,
                   "$30,000 - $39,999"=4,
                   "$40,000 - $49,999"=5,
                   "$50,000 - $59,999"=6,   
                   "$60,000 - $69,999" =7,
                   "$70,000 - $79,999"=8,
                   "$80,000 - $89,999" =9,  
                   "$90,000 - $99,999"=10,
                   "$100,000 - $119,999"=11,
                   "$120,000 - $149,999"=12,
                   "$150,000 - $199,999"=13,
                   "$200,000 - $249,999"=14,
                   "$250,000 - $349,000" =15,           
                   "$350,000 - $499,000"=16,
                   "$500,000 or more" = 17,
                   .default = NA_real_)

  # presumably Q5.2 holds a birth year -- confirm against the questionnaire.
  age <- survey_year - as.numeric(df$Q5.2)
  edu <- recode(df$Q5.3,
                'Did not graduate from high school'=1,
                'High school'=2,
                'Some college, no degree'=3,
                'Two-year degree'=4,
                'Four-year degree'=5,
                'Graduate degree'=6,
                .default = NA_real_)

  # Raw Q18.2 response, passed through without recoding.
  pid3 <- df$Q18.2

  man <- df$Q5.9 == 'Man'
  gender <- recode(df$Q5.9, 'Man'='Male', "Woman"="Female", 'Other (please specify)'='Other', .default = NA_character_)

  # One logical indicator per Q5.11 column (TRUE when the cell is non-empty);
  # the assignment below requires exactly seven matching columns.
  race <- as.data.frame(df[,grepl('Q5.11', colnames(df)), with=FALSE] != '')
  colnames(race) <- c('native-american', 'asian', 'black', 'hispanic', 'white', 'middle-eastern', 'other')

  med_pref <- str_extract(df$Q5.6, 'MSNBC|Fox News|Food Network') 

  # Mean familiarity (1-5) across the Q11.10 items.
  familiar <- apply(df[,grepl('Q11.10', colnames(df)), with=FALSE], 2, recode, 'Extremely familiar'=5, 'Very familiar'=4,
                    'Moderately familiar'=3, 'Slightly familiar'=2, 'Not familiar at all'=1, .default = NA_real_) %>%
    rowMeans()

  # ---- Outcome index and principal component ------------------------------
  outcomes <- data.frame(scale, censorship, privacy, congress, influence, fav_tech,break_tech)
  # Each item is divided by its maximum (7, or 5 for break_tech); every item
  # except censorship and fav_tech is reversed before averaging.
  outcomes_index <- rowMeans(data.frame(1-(scale/7), censorship/7, 1-(privacy/7), 1-(congress/7), 1-(influence/7), fav_tech/7,1-(break_tech/5)))

  # Use one mask for both the rows and the index so the two can never fall
  # out of alignment.
  is_complete <- complete.cases(outcomes)
  complete_df <- df[is_complete,]
  complete_df$outcomes_index <- outcomes_index[is_complete]

  # First principal component, scaled to unit SD. The sign of a principal
  # component is arbitrary, so it is oriented to correlate non-negatively
  # with outcomes_index (which is used only for this orientation).
  complete_df$pc <- -prcomp(outcomes[is_complete,], retx=TRUE)$x[,1]
  complete_df$pc <- complete_df$pc/sd(complete_df$pc)
  if(cor(complete_df$pc,complete_df$outcomes_index)<0){
    complete_df$pc <- -complete_df$pc
  }

  # ---- Assemble output -----------------------------------------------------
  # NOTE(review): `race` and `fav_tech` are each passed twice, so data.frame()
  # emits duplicate columns under de-duplicated names. Kept as-is so the
  # returned column set does not change for downstream code.
  to_return <- data.frame(EndDate=df$EndDate, break_tech, break_placebo, fav_tech, fav_plac, treat_supports, 
                          treat_effective, understand_tech, treat_liberal, pass_att1, pass_att2, pass_att3, age,
                          ideo, pid3, pid7, pid3_leaner, income, race, med_pref, man, gender, race,edu, PID=df$PID, treatment=df$treatment, 
                          scale, censorship, privacy, congress, influence, fav_tech)
  to_return$att <- pass_att1 + pass_att2 + pass_att3

  # Keep the first row for each PID, then attach pc by PID. The key is named
  # explicitly instead of relying on dplyr's shared-column detection.
  to_return <- to_return[!duplicated(to_return$PID),]
  to_return <- left_join(to_return, complete_df[!duplicated(complete_df$PID),c('pc', 'PID')], by = 'PID')

  to_return
}
